# Kernel: sgemm_tn_128x32

// Named constants used by the code below.
//   szShareA / szShareB : sizes of one A tile (128*16) and one B tile (32*16).
//                         They are always used inside a 4x<> expression.
//   addr_zero           : 4x<2*szShareA + 2*szShareB>, the offset just past two A
//                         tiles and two B tiles. Zeros are stored there once and
//                         loaded back whenever registers have to be cleared.
//   gridDimA / gridDimB : not referenced in the visible part of this file.
//   param_*             : kernel arguments, one 32-bit constant slot each;
//                         param_C/A/B occupy two slots ([0] and [1]).
// NOTE(review): 4x<expr> presumably expands to expr * 4 (element count to byte
// count) - confirm against the assembler documentation.
<CONSTANT_MAPPING>
    addr_zero : 4x<128*16*2 + 32*16*2>
    szShareA  : 128*16
    szShareB  : 32*16

    gridDimA : c[0x0][0x14]
    gridDimB : c[0x0][0x18]

    param_C[0]      : c[0x0][0x140]
    param_C[1]      : c[0x0][0x144]
    param_A[0]      : c[0x0][0x148]
    param_A[1]      : c[0x0][0x14c]
    param_B[0]      : c[0x0][0x150]
    param_B[1]      : c[0x0][0x154]
    param_alpha     : c[0x0][0x158]
    param_beta      : c[0x0][0x15c]
    param_flags     : c[0x0][0x160]
    param_lda8      : c[0x0][0x164]
    param_ldb8      : c[0x0][0x168]
    param_ldc       : c[0x0][0x16c]
    param_m         : c[0x0][0x170]
    param_n         : c[0x0][0x174]
    param_k         : c[0x0][0x178]
    param_ldaz      : c[0x0][0x17c]
    param_ldbz      : c[0x0][0x180]
    param_ldcz      : c[0x0][0x184]
    param_loops     : c[0x0][0x188]
</CONSTANT_MAPPING>

// Register names used by the code below. Several ranges overlap on purpose, so
// the same physical registers carry different names at different times.
//   32-79   (first line)  : temporaries used only while the setup code computes
//                           addresses; the range is shared with j0-j3 below.
//   0-31                  : czero<NN> and cx<0-3>y<0-7> are two sets of names for
//                           the same 32 registers. czero is used to clear them
//                           four at a time with 128-bit loads.
//   32-79   (j0-j3)       : four groups of eight Ay and four Bx registers; not
//                           referenced in the visible part of this file.
//   80-99                 : destinations of the global loads (one group of four
//                           for B, four groups of four for A).
//   100-109               : global pointers as register pairs; element 0 receives
//                           the LEA result and element 1 the LEA.HI.X result.
//   110-127               : values still needed after setup (shared-memory read
//                           and write offsets, byte strides, k, thread/block ids).
//   32-120  (last 3 lines): a further set of names that is not referenced in the
//                           visible part of this file.
// NOTE(review): ':' lines appear to bind names to the listed registers in order,
// while '~' lines presumably let the assembler choose registers inside the range;
// the j0-j3 and C/c/d names are presumably consumed by the included common file.
// Confirm both against the assembler documentation and the include.
<REGISTER_MAPPING>

    32-79 ~ lda, lda4, ldb, ldaz, ldbz, ta<0-3>, tb, tid1, tidAX, tidBX, tidAY<1-3>, txa<1-3>, txb<1-3>

    0-31 : czero<00-31>

     3, 2,11,10 : cx<0-3>y0
     7, 6,15,14 : cx<0-3>y1
     1, 0, 9, 8 : cx<0-3>y2
     5, 4,13,12 : cx<0-3>y3
    19,18,27,26 : cx<0-3>y4
    23,22,31,30 : cx<0-3>y5
    17,16,25,24 : cx<0-3>y6
    21,20,29,28 : cx<0-3>y7

      32-43 : j0Ay<0-7>, j0Bx<0-3>
      44-55 : j1Ay<0-7>, j1Bx<0-3>
      56-67 : j2Ay<0-7>, j2Bx<0-3>
      68-79 : j3Ay<0-7>, j3Bx<0-3>

      80-83 : loadB<0-3>
      84-99 : load0A<0-3>, load1A<0-3>, load2A<0-3>, load3A<0-3>

    100-109 : trackB<0-1>, track0A<0-1>, track1A<0-1>, track2A<0-1>, track3A<0-1>

    110-120 ~ writeAs, writeBs, lda16, ldb16, k, tidAY, tidBY, txa, txb
    121-127 ~ swapBuf, readAs, readBs, tid, blkA, blkB, blkZ

    32-39 : C00y<0-1>, C04y<0-1>, C08y<0-1>, C12y<0-1>
    40-47 : c<0-3>, d3, d2, d1, d0
   48-120 ~ tid31, tid96, ldc, ldcz, cx, ci, xmad_c, ldc1, ldc4, ldc60, writeCs, readCs, cy<00|04|08|12>, alpha, beta, flags

</REGISTER_MAPPING>

// Thread index and block indices. blkA comes from CTAID.Y, blkB from CTAID.Z
// and blkZ from CTAID.X.
-:-:-:-:00 S2R tid,  SR_TID.X;
-:-:-:-:00 S2R blkA, SR_CTAID.Y;
-:-:-:-:00 S2R blkB, SR_CTAID.Z;
-:-:-:-:00 S2R blkZ, SR_CTAID.X;

// k counts down the remaining extent of the reduction dimension.
// lda = param_lda8 >> 5 and ldb = param_ldb8 >> 5 are used as element strides.
// NOTE(review): presumably param_lda8/param_ldb8 hold the row stride times
// 8 elements times 4 bytes - confirm against the host-side launcher.
// lda16 = lda << 6 and ldb16 = ldb << 6 are the byte steps added to the track
// pointers after each tile (16 rows of 4-byte elements).
// lda4 = lda << 2 is the element offset between ta0, ta1, ta2 and ta3.
-:-:-:-:00 MOV k,    param_k;
-:-:-:-:00 MOV lda,  param_lda8;
-:-:-:-:00 MOV ldb,  param_ldb8;
-:-:-:-:00 SHR.U32 lda, lda, 5;
-:-:-:-:00 SHR.U32 ldb, ldb, 5;
-:-:-:-:00 MOV ldaz, param_ldaz;
-:-:-:-:00 MOV ldbz, param_ldbz;
-:-:-:-:00 SHL lda16, lda, 6;
-:-:-:-:00 SHL ldb16, ldb, 6;
-:-:-:-:00 SHL lda4,  lda, 2;

// Write zeros to shared memory at addr_zero. They are read back right after
// this to clear czero00-czero31, and later to fill loads that are out of range.
-:-:-:-:00 STS.128 [addr_zero], RZ;
<CODE>
    # Emit eight 128-bit shared-memory loads from addr_zero, one per group of
    # four registers: czero00, czero04, ... czero28.
    my $sass = '';
    foreach my $quad (0 .. 7)
    {
        my $first = 4 * $quad;
        $sass .= sprintf "-:-:-:-:00 LDS.U.128 czero%02d, [addr_zero];\n", $first;
    }
    return $sass;
</CODE>

// Thread coordinates inside the A tile (four consecutive elements per thread):
// tidAX = (tid & 31) << 2
// tidAY = (tid >> 5)
-:-:-:-:00 LOP.AND tidAX, tid,   31;
-:-:-:-:00 SHL     tidAX, tidAX, 2;
-:-:-:-:00 SHR.U32 tidAY, tid,   5;

// Thread coordinates inside the B tile:
// tidBX = (tid & 7) << 2
// tidBY = (tid >> 3)
-:-:-:-:00 LOP.AND tidBX, tid,   7;
-:-:-:-:00 SHL     tidBX, tidBX, 2;
-:-:-:-:00 SHR.U32 tidBY, tid,   3;

// txa = blkA*128 + tidAX                      (column of A, compared with param_m later)
// ta0 = txa + lda*tidAY + ldaz*blkZ           (element index of the first A row)
// ta1, ta2, ta3 = ta0 + 4*lda, 8*lda, 12*lda  (rows tidAY + 4, + 8, + 12)
-:-:-:-:00 ISCADD   txa, blkA, tidAX, 7;
-:-:-:-:00 XMAD.LO2 ta0, lda,  tidAY, txa;
-:-:-:-:00 XMAD.LO2 ta0, ldaz, blkZ,  ta0;
-:-:-:-:00 IADD     ta1, ta0, lda4;
-:-:-:-:00 IADD     ta2, ta1, lda4;
-:-:-:-:00 IADD     ta3, ta2, lda4;

// trackNA = param_A + taN * 4, as a 64-bit address. The low half is computed
// with a carry-out (.CC) that the following LEA.HI.X consumes for the high half.
-:-:-:-:00 LEA      track0A0.CC, ta0, param_A[0],     2;
-:-:-:-:00 LEA.HI.X track0A1,    ta0, param_A[1], RZ, 2;
-:-:-:-:00 LEA      track1A0.CC, ta1, param_A[0],     2;
-:-:-:-:00 LEA.HI.X track1A1,    ta1, param_A[1], RZ, 2;
-:-:-:-:00 LEA      track2A0.CC, ta2, param_A[0],     2;
-:-:-:-:00 LEA.HI.X track2A1,    ta2, param_A[1], RZ, 2;
-:-:-:-:00 LEA      track3A0.CC, ta3, param_A[0],     2;
-:-:-:-:00 LEA.HI.X track3A1,    ta3, param_A[1], RZ, 2;

// txb    = blkB*32 + tidBX                    (column of B, compared with param_n later)
// trackB = param_B + (txb + ldb*tidBY + ldbz*blkZ) * 4
-:-:-:-:00 ISCADD   txb, blkB, tidBX,  5;
-:-:-:-:00 XMAD.LO2 tb,  ldb,  tidBY, txb;
-:-:-:-:00 XMAD.LO2 tb,  ldbz, blkZ,  tb;
-:-:-:-:00 LEA      trackB0.CC, tb, param_B[0],     2;
-:-:-:-:00 LEA.HI.X trackB1,    tb, param_B[1], RZ, 2;

// writeAs = (tidAY*128 + tidAX) * 4 + 4x<szShareA + szShareB>
// The added offset places the first write one A tile plus one B tile into shared memory.
-:-:-:-:00 ISCADD writeAs, tidAY, tidAX, 7;
-:-:-:-:00 ISCADD writeAs, writeAs, 4x<szShareA + szShareB>, 2;

// writeBs = (tidBY*32 + tidBX) * 4 + 4x<szShareA*2 + szShareB>
-:-:-:-:00 ISCADD writeBs, tidBY, tidBX, 5;
-:-:-:-:00 ISCADD writeBs, writeBs, 4x<szShareA*2 + szShareB>, 2;

// readAs = (((tid & 0x70) >> 3) | (tid & 1)) << 4
-:-:-:-:00 LOP.AND tid1,   tid,    1;
-:-:-:-:00 LOP.AND readAs, tid,    0x70;
-:-:-:-:00 SHR.U32 readAs, readAs, 3;
-:-:-:-:00 LOP.OR  readAs, readAs, tid1;
-:-:-:-:00 SHL     readAs, readAs, 4;
// readBs = (((tid >> 1) & 7) << 4) + 4x<szShareA>
-:-:-:-:00 BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
-:-:-:-:00 ISCADD  readBs, readBs, 4x<szShareA>, 4;

// swapBuf starts as minus the size of one A tile plus one B tile. At every
// buffer swap it is added to the write offsets, subtracted from the read
// offsets, and then negated.
-:-:-:-:00 MOV32I swapBuf, -4x<szShareA + szShareB>;

// Entry point of the bounds-checked tile load. Execution falls into it once
// after setup, and the loop insertions branch back here when P1 is set.
REMAINDER:

// Row indices of the three further A rows this thread loads; they are compared
// with k in the load code that follows.
-:-:-:-:00 IADD tidAY1, tidAY, 4;
-:-:-:-:00 IADD tidAY2, tidAY, 8;
-:-:-:-:00 IADD tidAY3, tidAY, 12;

<CODE>
    # Returns the instruction text for the bounds-checked load of one tile
    # (four A rows and one B row per thread). $vec is a package variable set
    # outside the visible part of this file and selects one of two variants:
    #
    #   $vec true  - one 128-bit load per row.
    #                P5 = txa < param_m, P6 = txb < param_n,
    #                P0..P3 = (tidAY, tidAY1, tidAY2, tidAY3 < k) and P5,
    #                P4 = (tidBY < k) and P6.
    #                A row whose predicate is false is filled from addr_zero.
    #
    #   $vec false - four 32-bit loads per row. The row test (tidAY* < k or
    #                tidBY < k) goes into P4/P5/P6 and is combined with a
    #                per-column test (txa..txa3 < param_m, txb..txb3 < param_n)
    #                in P0..P3. Elements that fail are set to RZ.
    #                P5 and P6 serve as scratch here, so the last two lines
    #                recompute them.
    #
    # In both variants P5 = txa < param_m and P6 = txb < param_n on exit; the
    # code after this block relies on that.
    our $vec;
    return $vec ? q{
-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;

-:-:-:-:00 ISETP.LT.AND P0, PT, tidAY,  k, P5;
-:-:-:-:00 ISETP.LT.AND P1, PT, tidAY1, k, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, tidAY2, k, P5;
-:-:-:-:00 ISETP.LT.AND P3, PT, tidAY3, k, P5;
-:-:-:-:00 ISETP.LT.AND P4, PT, tidBY,  k, P6;

-:-:-:-:00 @P0 LDG.E.CI.128 load0A, [track0A];
-:-:-:-:00 @P1 LDG.E.CI.128 load1A, [track1A];
-:-:-:-:00 @P2 LDG.E.CI.128 load2A, [track2A];
-:-:-:-:00 @P3 LDG.E.CI.128 load3A, [track3A];
-:-:-:-:00 @P4 LDG.E.CI.128 loadB,  [trackB];

-:-:-:-:00 @!P0 LDS.U.128 load0A, [addr_zero];
-:-:-:-:00 @!P1 LDS.U.128 load1A, [addr_zero];
-:-:-:-:00 @!P2 LDS.U.128 load2A, [addr_zero];
-:-:-:-:00 @!P3 LDS.U.128 load3A, [addr_zero];
-:-:-:-:00 @!P4 LDS.U.128 loadB,  [addr_zero];

   } : q{

-:-:-:-:00 IADD txa1,  txa,  1;
-:-:-:-:00 IADD txa2,  txa,  2;
-:-:-:-:00 IADD txa3,  txa,  3;

-:-:-:-:00 ISETP.LT.AND P4, PT, tidAY, k, PT;
-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P4;
-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P4;
-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P4;

-:-:-:-:00 @P0 LDG.E.CI load0A0, [track0A + 4x<0>];
-:-:-:-:00 @P1 LDG.E.CI load0A1, [track0A + 4x<1>];
-:-:-:-:00 @P2 LDG.E.CI load0A2, [track0A + 4x<2>];
-:-:-:-:00 @P3 LDG.E.CI load0A3, [track0A + 4x<3>];

-:-:-:-:00 @!P0 MOV load0A0, RZ;
-:-:-:-:00 @!P1 MOV load0A1, RZ;
-:-:-:-:00 @!P2 MOV load0A2, RZ;
-:-:-:-:00 @!P3 MOV load0A3, RZ;

-:-:-:-:00 ISETP.LT.AND P5, PT, tidAY1, k, PT;
-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P5;
-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P5;

-:-:-:-:00 @P0 LDG.E.CI load1A0, [track1A + 4x<0>];
-:-:-:-:00 @P1 LDG.E.CI load1A1, [track1A + 4x<1>];
-:-:-:-:00 @P2 LDG.E.CI load1A2, [track1A + 4x<2>];
-:-:-:-:00 @P3 LDG.E.CI load1A3, [track1A + 4x<3>];

-:-:-:-:00 @!P0 MOV load1A0, RZ;
-:-:-:-:00 @!P1 MOV load1A1, RZ;
-:-:-:-:00 @!P2 MOV load1A2, RZ;
-:-:-:-:00 @!P3 MOV load1A3, RZ;

-:-:-:-:00 ISETP.LT.AND P6, PT, tidAY2, k, PT;
-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P6;
-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P6;
-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P6;
-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P6;

-:-:-:-:00 @P0 LDG.E.CI load2A0, [track2A + 4x<0>];
-:-:-:-:00 @P1 LDG.E.CI load2A1, [track2A + 4x<1>];
-:-:-:-:00 @P2 LDG.E.CI load2A2, [track2A + 4x<2>];
-:-:-:-:00 @P3 LDG.E.CI load2A3, [track2A + 4x<3>];

-:-:-:-:00 @!P0 MOV load2A0, RZ;
-:-:-:-:00 @!P1 MOV load2A1, RZ;
-:-:-:-:00 @!P2 MOV load2A2, RZ;
-:-:-:-:00 @!P3 MOV load2A3, RZ;

-:-:-:-:00 ISETP.LT.AND P5, PT, tidAY3, k, PT;
-:-:-:-:00 ISETP.LT.AND P0, PT, txa,  param_m, P5;
-:-:-:-:00 ISETP.LT.AND P1, PT, txa1, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P2, PT, txa2, param_m, P5;
-:-:-:-:00 ISETP.LT.AND P3, PT, txa3, param_m, P5;

-:-:-:-:00 @P0 LDG.E.CI load3A0, [track3A + 4x<0>];
-:-:-:-:00 @P1 LDG.E.CI load3A1, [track3A + 4x<1>];
-:-:-:-:00 @P2 LDG.E.CI load3A2, [track3A + 4x<2>];
-:-:-:-:00 @P3 LDG.E.CI load3A3, [track3A + 4x<3>];

-:-:-:-:00 @!P0 MOV load3A0, RZ;
-:-:-:-:00 @!P1 MOV load3A1, RZ;
-:-:-:-:00 @!P2 MOV load3A2, RZ;
-:-:-:-:00 @!P3 MOV load3A3, RZ;

-:-:-:-:00 IADD txb1,  txb,  1;
-:-:-:-:00 IADD txb2,  txb,  2;
-:-:-:-:00 IADD txb3,  txb,  3;

-:-:-:-:00 ISETP.LT.AND P4, PT, tidBY, k, PT;
-:-:-:-:00 ISETP.LT.AND P0, PT, txb,  param_n, P4;
-:-:-:-:00 ISETP.LT.AND P1, PT, txb1, param_n, P4;
-:-:-:-:00 ISETP.LT.AND P2, PT, txb2, param_n, P4;
-:-:-:-:00 ISETP.LT.AND P3, PT, txb3, param_n, P4;

-:-:-:-:00 @P0 LDG.E.CI loadB0, [trackB + 4x<0>];
-:-:-:-:00 @P1 LDG.E.CI loadB1, [trackB + 4x<1>];
-:-:-:-:00 @P2 LDG.E.CI loadB2, [trackB + 4x<2>];
-:-:-:-:00 @P3 LDG.E.CI loadB3, [trackB + 4x<3>];

-:-:-:-:00 @!P0 MOV loadB0, RZ;
-:-:-:-:00 @!P1 MOV loadB1, RZ;
-:-:-:-:00 @!P2 MOV loadB2, RZ;
-:-:-:-:00 @!P3 MOV loadB3, RZ;

-:-:-:-:00 ISETP.LT.AND P5, PT, txa, param_m, PT;
-:-:-:-:00 ISETP.LT.AND P6, PT, txb, param_n, PT;
    };
</CODE>

// Prefetch predicates: a further tile is fetched only when k >= 32 and the
// thread's column is in range. P2, P3 and P5 (A rows) all start from P5
// (txa < param_m); P6 (B) starts from P6 (txb < param_n).
-:-:-:-:00 ISETP.GE.AND P2, PT, k, 32, P5;
-:-:-:-:00 ISETP.GE.AND P3, PT, k, 32, P5;
-:-:-:-:00 ISETP.GE.AND P5, PT, k, 32, P5;
-:-:-:-:00 ISETP.GE.AND P6, PT, k, 32, P6;

// bDoRemainder = k & 15 && k > 16
// Computed in two steps: P1 = (k & 15) != 0 here, and the "k > 16" half is
// ANDed in by the ISETP.GT further down.
-:-:-:-:00 LOP.AND.NZ P1, RZ, k, 15;

// Store the loaded tile to shared memory (A rows tidAY, +4, +8, +12) and
// advance each 64-bit track pointer by lda16; IADD.X adds the carry into the
// high half.
-:-:-:-:00 STS.128 [writeAs + 4x<0*128>], load0A;
-:-:-:-:00 IADD   track0A0.CC, track0A0, lda16;
-:-:-:-:00 IADD.X track0A1,    track0A1, RZ;

-:-:-:-:00 STS.128 [writeAs + 4x<4*128>], load1A;
-:-:-:-:00 IADD   track1A0.CC, track1A0, lda16;
-:-:-:-:00 IADD.X track1A1,    track1A1, RZ;

-:-:-:-:00 STS.128 [writeAs + 4x<8*128>], load2A;
-:-:-:-:00 IADD   track2A0.CC, track2A0, lda16;
-:-:-:-:00 IADD.X track2A1,    track2A1, RZ;

-:-:-:-:00 STS.128 [writeAs + 4x<12*128>], load3A;
-:-:-:-:00 IADD   track3A0.CC, track3A0, lda16;
-:-:-:-:00 IADD.X track3A1,    track3A1, RZ;

// NOTE(review): the IADD.X that completes this trackB update is the last
// instruction of this section. None of the instructions in between names CC as
// a destination; confirm that none of them modifies it implicitly.
-:-:-:-:00 STS.128 [writeBs], loadB;
-:-:-:-:00 IADD   trackB0.CC, trackB0, ldb16;

// P1 = (k > 16) and (k & 15) != 0
-:-:-:-:00 ISETP.GT.AND P1, PT, k, 16, P1;

// Swap buffers: the read offsets move to the buffer that was just written, the
// write offsets move to the other one, and swapBuf changes sign for next time.
-:-:-:-:00 IADD readBs,  readBs, -swapBuf;
-:-:-:-:00 IADD readAs,  readAs, -swapBuf;
-:-:-:-:00 BAR.SYNC 0;
-:-:-:-:00 IADD writeBs, writeBs, swapBuf;
-:-:-:-:00 IADD writeAs, writeAs, swapBuf;
-:-:-:-:00 IADD swapBuf, RZ, -swapBuf;

// High half of the trackB advance started above.
-:-:-:-:00 IADD.X trackB1,    trackB1, RZ;

<CODE>
    # Returns the instruction text that fetches the next tile from global
    # memory before the main loop starts. All A loads are predicated on P5 and
    # the B load on P6, which at this point mean "k >= 32 and column in range".
    # No zero fill is emitted for a false predicate.
    #   $vec true  - one 128-bit load per row.
    #   $vec false - four 32-bit loads per row at offsets 4x<0> .. 4x<3>.
    # $vec is a package variable set outside the visible part of this file.
    our $vec;
    return $vec ? q{
-:-:-:-:00 @P5 LDG.E.CI.128 load0A, [track0A];
-:-:-:-:00 @P5 LDG.E.CI.128 load1A, [track1A];
-:-:-:-:00 @P5 LDG.E.CI.128 load2A, [track2A];
-:-:-:-:00 @P5 LDG.E.CI.128 load3A, [track3A];
-:-:-:-:00 @P6 LDG.E.CI.128 loadB,  [trackB];
   } : q{
-:-:-:-:00 @P5 LDG.E.CI load0A0, [track0A + 4x<0>];
-:-:-:-:00 @P5 LDG.E.CI load0A1, [track0A + 4x<1>];
-:-:-:-:00 @P5 LDG.E.CI load0A2, [track0A + 4x<2>];
-:-:-:-:00 @P5 LDG.E.CI load0A3, [track0A + 4x<3>];

-:-:-:-:00 @P5 LDG.E.CI load1A0, [track1A + 4x<0>];
-:-:-:-:00 @P5 LDG.E.CI load1A1, [track1A + 4x<1>];
-:-:-:-:00 @P5 LDG.E.CI load1A2, [track1A + 4x<2>];
-:-:-:-:00 @P5 LDG.E.CI load1A3, [track1A + 4x<3>];

-:-:-:-:00 @P5 LDG.E.CI load2A0, [track2A + 4x<0>];
-:-:-:-:00 @P5 LDG.E.CI load2A1, [track2A + 4x<1>];
-:-:-:-:00 @P5 LDG.E.CI load2A2, [track2A + 4x<2>];
-:-:-:-:00 @P5 LDG.E.CI load2A3, [track2A + 4x<3>];

-:-:-:-:00 @P5 LDG.E.CI load3A0, [track3A + 4x<0>];
-:-:-:-:00 @P5 LDG.E.CI load3A1, [track3A + 4x<1>];
-:-:-:-:00 @P5 LDG.E.CI load3A2, [track3A + 4x<2>];
-:-:-:-:00 @P5 LDG.E.CI load3A3, [track3A + 4x<3>];

-:-:-:-:00 @P6 LDG.E.CI loadB0, [trackB + 4x<0>];
-:-:-:-:00 @P6 LDG.E.CI loadB1, [trackB + 4x<1>];
-:-:-:-:00 @P6 LDG.E.CI loadB2, [trackB + 4x<2>];
-:-:-:-:00 @P6 LDG.E.CI loadB3, [trackB + 4x<3>];
    };
</CODE>

<CODE>
    # Defines the package variables $shiftAX, $shiftBX and %insert and returns
    # nothing, so this block itself contributes no instruction text.
    # NOTE(review): these variables are presumably read by the file included
    # right after this block, with each %insert key "j<N>c<M>" naming a position
    # in the loop it generates - confirm against sgemm_common_128x32.sass.
    #
    # Predicates used by the inserted instructions:
    #   P0         - k >= 16 after the decrement: store the prefetched tile,
    #                swap buffers and take the LOOP branch.
    #   P1         - take the REMAINDER branch (set before the loop).
    #   P2, P3, P5 - advance the A pointers and prefetch the next A rows
    #                (P5 covers both load2A and load3A).
    #   P6         - the same for B.
    our $vec;
    our $shiftAX = 0;
    our $shiftBX = 0;
    our %insert =
    (
        # Loop counter: consume 16 from k, then test whether a tile remains.
        j0c6   => "-:-:-:-:00 IADD k, k, -16;\n",
        j0c14  => "-:-:-:-:00 ISETP.GE.AND P0, PT, k, 16, PT;\n",

        # Store the previously prefetched tile to shared memory.
        j3c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 0*128>], load0A;\n",
        j5c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 4*128>], load1A;\n",
        j7c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x< 8*128>], load2A;\n",
        j9c6   => "-:-:-:-:00 \@P0 STS.128 [writeAs + 4x<12*128>], load3A;\n",
        j11c6  => "-:-:-:-:00 \@P0 STS.128 [writeBs], loadB;\n",

        # Advance the 64-bit global pointers; each low-half add (.CC) is paired
        # with a later IADD.X for the high half.
        j3c7   => "-:-:-:-:00 \@P2 IADD   track0A0.CC, track0A0, lda16;\n",
        j3c13  => "-:-:-:-:00 \@P2 IADD.X track0A1,    track0A1, RZ;\n",
        j5c7   => "-:-:-:-:00 \@P3 IADD   track1A0.CC, track1A0, lda16;\n",
        j5c13  => "-:-:-:-:00 \@P3 IADD.X track1A1,    track1A1, RZ;\n",
        j7c7   => "-:-:-:-:00 \@P5 IADD   track2A0.CC, track2A0, lda16;\n",
        j7c13  => "-:-:-:-:00 \@P5 IADD.X track2A1,    track2A1, RZ;\n",
        j9c7   => "-:-:-:-:00 \@P5 IADD   track3A0.CC, track3A0, lda16;\n",
        j9c13  => "-:-:-:-:00 \@P5 IADD.X track3A1,    track3A1, RZ;\n",
        j11c7  => "-:-:-:-:00 \@P6 IADD   trackB0.CC,  trackB0,  ldb16;\n",
        j11c13 => "-:-:-:-:00 \@P6 IADD.X trackB1,     trackB1,  RZ;\n",

        # Refresh the prefetch predicates: stay true only while k >= 32.
        # There is no j7c14 entry because P5 is shared by load2A and load3A
        # and is refreshed once, at j9c14.
        j3c14  => "-:-:-:-:00 ISETP.GE.AND P2, PT, k, 32, P2;\n",
        j5c14  => "-:-:-:-:00 ISETP.GE.AND P3, PT, k, 32, P3;\n",
        j9c14  => "-:-:-:-:00 ISETP.GE.AND P5, PT, k, 32, P5;\n",
        j11c14 => "-:-:-:-:00 ISETP.GE.AND P6, PT, k, 32, P6;\n",

        # Barrier, then swap the shared-memory read/write offsets and flip the
        # sign of swapBuf.
        j13c31 => "-:-:-:-:00 \@P0 BAR.SYNC 0;\n" .
                  "-:-:-:-:00 \@P0 IADD readAs,  readAs, -swapBuf;\n" .
                  "-:-:-:-:00 \@P0 IADD readBs,  readBs, -swapBuf;\n" .
                  "-:-:-:-:00 \@P0 IADD writeAs, writeAs, swapBuf;\n" .
                  "-:-:-:-:00 \@P0 IADD writeBs, writeBs, swapBuf;\n" .
                  "-:-:-:-:00 \@P0 IADD swapBuf, RZ,     -swapBuf;\n",

        # Global prefetch of the next tile: one 128-bit load per row when $vec
        # is true, otherwise four 32-bit loads per row.
        ($vec ?
            (
                j3c29  => "-:-:-:-:00 \@P2 LDG.E.CI.128 load0A, [track0A];\n",
                j5c29  => "-:-:-:-:00 \@P3 LDG.E.CI.128 load1A, [track1A];\n",
                j9c29  => "-:-:-:-:00 \@P5 LDG.E.CI.128 load2A, [track2A];\n",
                j9c31  => "-:-:-:-:00 \@P5 LDG.E.CI.128 load3A, [track3A];\n",
                j11c29 => "-:-:-:-:00 \@P6 LDG.E.CI.128 loadB,  [trackB];\n",
            ) :
            (
                j3c29  => "-:-:-:-:00 \@P2 LDG.E.CI load0A0, [track0A + 4x<0>];\n",
                j3c31  => "-:-:-:-:00 \@P2 LDG.E.CI load0A1, [track0A + 4x<1>];\n",
                j4c1   => "-:-:-:-:00 \@P2 LDG.E.CI load0A2, [track0A + 4x<2>];\n",
                j4c3   => "-:-:-:-:00 \@P2 LDG.E.CI load0A3, [track0A + 4x<3>];\n",

                j5c29  => "-:-:-:-:00 \@P3 LDG.E.CI load1A0, [track1A + 4x<0>];\n",
                j5c31  => "-:-:-:-:00 \@P3 LDG.E.CI load1A1, [track1A + 4x<1>];\n",
                j6c1   => "-:-:-:-:00 \@P3 LDG.E.CI load1A2, [track1A + 4x<2>];\n",
                j6c3   => "-:-:-:-:00 \@P3 LDG.E.CI load1A3, [track1A + 4x<3>];\n",

                j9c29  => "-:-:-:-:00 \@P5 LDG.E.CI load2A0, [track2A + 4x<0>];\n",
                j9c31  => "-:-:-:-:00 \@P5 LDG.E.CI load2A1, [track2A + 4x<1>];\n",
                j10c1  => "-:-:-:-:00 \@P5 LDG.E.CI load2A2, [track2A + 4x<2>];\n",
                j10c3  => "-:-:-:-:00 \@P5 LDG.E.CI load2A3, [track2A + 4x<3>];\n",

                j10c8  => "-:-:-:-:00 \@P5 LDG.E.CI load3A0, [track3A + 4x<0>];\n",
                j10c10 => "-:-:-:-:00 \@P5 LDG.E.CI load3A1, [track3A + 4x<1>];\n",
                j10c12 => "-:-:-:-:00 \@P5 LDG.E.CI load3A2, [track3A + 4x<2>];\n",
                j10c14 => "-:-:-:-:00 \@P5 LDG.E.CI load3A3, [track3A + 4x<3>];\n",

                j11c29 => "-:-:-:-:00 \@P6 LDG.E.CI loadB0, [trackB + 4x<0>];\n",
                j11c31 => "-:-:-:-:00 \@P6 LDG.E.CI loadB1, [trackB + 4x<1>];\n",
                j12c1  => "-:-:-:-:00 \@P6 LDG.E.CI loadB2, [trackB + 4x<2>];\n",
                j12c3  => "-:-:-:-:00 \@P6 LDG.E.CI loadB3, [trackB + 4x<3>];\n",
            )
        ),

        # Loop control: continue while a whole tile remains (P0), otherwise go
        # back to REMAINDER when P1 is set. LOOP is not defined in the visible
        # part of this file.
        j15c31 => "-:-:-:-:00 \@P0 BRA.U LOOP;\n" .
                  "-:-:-:-:00 \@P1 BRA.U REMAINDER;\n"
    );
    return ;
</CODE>

<INCLUDE file="sgemm_common_128x32.sass"/>
